In [103]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.io as pio
pio.renderers.default = 'notebook'
In [104]:
df=pd.read_csv('Cleaned_Clothing.csv')
In [105]:
df.head()
Out[105]:
| Customer ID | Customer Name | Product | Product Category | Size | Color | Cost (in NPR) | Quantity | Total Cost (in NPR) | Purchase Date | Store Location | Purchase Method | Foot Traffic | Gender | Product Rating | Customer Type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | CUST001 | Customer_1 | Dress | Casualwear | L | Black | 1800 | 4 | 7200 | 2024-04-30 | Bhaktapur | Visited Store | 132 | Male | 2 | New |
| 1 | CUST002 | Customer_2 | Shirt | Casualwear | L | Blue | 1737 | 2 | 3474 | 2023-12-17 | Lalitpur | Website | 471 | Female | 2 | New |
| 2 | CUST003 | Customer_3 | Skirt | Bottomwear | S | Black | 648 | 3 | 1944 | 2023-11-19 | Boudha | Visited Store | 488 | Male | 4 | Returning |
| 3 | CUST004 | Customer_4 | Blazer | Bottomwear | XL | Yellow | 2603 | 1 | 2603 | 2024-04-30 | Thamel | Visited Store | 309 | Male | 2 | Returning |
| 4 | CUST005 | Customer_5 | T-shirt | Outerwear | M | Blue | 780 | 1 | 780 | 2024-09-16 | Bhaktapur | Visited Store | 395 | Male | 2 | Returning |
In [106]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 100 entries, 0 to 99 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Customer ID 100 non-null object 1 Customer Name 100 non-null object 2 Product 100 non-null object 3 Product Category 100 non-null object 4 Size 100 non-null object 5 Color 100 non-null object 6 Cost (in NPR) 100 non-null int64 7 Quantity 100 non-null int64 8 Total Cost (in NPR) 100 non-null int64 9 Purchase Date 100 non-null object 10 Store Location 100 non-null object 11 Purchase Method 100 non-null object 12 Foot Traffic 100 non-null int64 13 Gender 100 non-null object 14 Product Rating 100 non-null int64 15 Customer Type 100 non-null object dtypes: int64(5), object(11) memory usage: 12.6+ KB
In [107]:
# Sorting
df = df.sort_values("Customer Name")
df.head()
Out[107]:
| Customer ID | Customer Name | Product | Product Category | Size | Color | Cost (in NPR) | Quantity | Total Cost (in NPR) | Purchase Date | Store Location | Purchase Method | Foot Traffic | Gender | Product Rating | Customer Type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | CUST001 | Customer_1 | Dress | Casualwear | L | Black | 1800 | 4 | 7200 | 2024-04-30 | Bhaktapur | Visited Store | 132 | Male | 2 | New |
| 9 | CUST010 | Customer_10 | Shorts | Bottomwear | S | Red | 623 | 1 | 623 | 2024-05-13 | Kathmandu | Visited Store | 144 | Female | 5 | Returning |
| 99 | CUST100 | Customer_100 | T-shirt | Topwear | M | Red | 1287 | 5 | 6435 | 2024-01-13 | Bhaktapur | Visited Store | 150 | Male | 5 | New |
| 10 | CUST011 | Customer_11 | Blazer | Formalwear | M | Yellow | 2142 | 2 | 4284 | 2024-09-29 | Thamel | Visited Store | 418 | Male | 2 | Returning |
| 11 | CUST012 | Customer_12 | T-shirt | Topwear | L | Pink | 620 | 1 | 620 | 2024-02-21 | Thamel | Visited Store | 127 | Male | 5 | New |
In [108]:
df.isnull().sum()
Out[108]:
Customer ID 0 Customer Name 0 Product 0 Product Category 0 Size 0 Color 0 Cost (in NPR) 0 Quantity 0 Total Cost (in NPR) 0 Purchase Date 0 Store Location 0 Purchase Method 0 Foot Traffic 0 Gender 0 Product Rating 0 Customer Type 0 dtype: int64
In [109]:
# Dropping Unnecessary Columns
df = df.drop(["Customer ID", "Foot Traffic"], axis=1)
df.head()
Out[109]:
| Customer Name | Product | Product Category | Size | Color | Cost (in NPR) | Quantity | Total Cost (in NPR) | Purchase Date | Store Location | Purchase Method | Gender | Product Rating | Customer Type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Customer_1 | Dress | Casualwear | L | Black | 1800 | 4 | 7200 | 2024-04-30 | Bhaktapur | Visited Store | Male | 2 | New |
| 9 | Customer_10 | Shorts | Bottomwear | S | Red | 623 | 1 | 623 | 2024-05-13 | Kathmandu | Visited Store | Female | 5 | Returning |
| 99 | Customer_100 | T-shirt | Topwear | M | Red | 1287 | 5 | 6435 | 2024-01-13 | Bhaktapur | Visited Store | Male | 5 | New |
| 10 | Customer_11 | Blazer | Formalwear | M | Yellow | 2142 | 2 | 4284 | 2024-09-29 | Thamel | Visited Store | Male | 2 | Returning |
| 11 | Customer_12 | T-shirt | Topwear | L | Pink | 620 | 1 | 620 | 2024-02-21 | Thamel | Visited Store | Male | 5 | New |
In [110]:
# Rename the column in the DataFrame
df.rename(columns={'Cost (in NPR)': 'Cost'}, inplace=True)
df.rename(columns={'Total Cost (in NPR)': 'Total_Cost'}, inplace=True)
In [111]:
#mapping visited store as 1 and website for 0
purchase_method_mapping= {"Visited Store": 1, "Website": 0}
df["Purchase Method"] = df["Purchase Method"].map(purchase_method_mapping)
In [112]:
df.head()
Out[112]:
| Customer Name | Product | Product Category | Size | Color | Cost | Quantity | Total_Cost | Purchase Date | Store Location | Purchase Method | Gender | Product Rating | Customer Type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Customer_1 | Dress | Casualwear | L | Black | 1800 | 4 | 7200 | 2024-04-30 | Bhaktapur | 1 | Male | 2 | New |
| 9 | Customer_10 | Shorts | Bottomwear | S | Red | 623 | 1 | 623 | 2024-05-13 | Kathmandu | 1 | Female | 5 | Returning |
| 99 | Customer_100 | T-shirt | Topwear | M | Red | 1287 | 5 | 6435 | 2024-01-13 | Bhaktapur | 1 | Male | 5 | New |
| 10 | Customer_11 | Blazer | Formalwear | M | Yellow | 2142 | 2 | 4284 | 2024-09-29 | Thamel | 1 | Male | 2 | Returning |
| 11 | Customer_12 | T-shirt | Topwear | L | Pink | 620 | 1 | 620 | 2024-02-21 | Thamel | 1 | Male | 5 | New |
In [113]:
#mapping customer type as 1 for new and 0 for existing
customer_type_mapping= {
"New": 1,
"Returning": 0
}
df["Customer Type"] = df["Customer Type"].map(customer_type_mapping)
df.head()
Out[113]:
| Customer Name | Product | Product Category | Size | Color | Cost | Quantity | Total_Cost | Purchase Date | Store Location | Purchase Method | Gender | Product Rating | Customer Type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Customer_1 | Dress | Casualwear | L | Black | 1800 | 4 | 7200 | 2024-04-30 | Bhaktapur | 1 | Male | 2 | 1 |
| 9 | Customer_10 | Shorts | Bottomwear | S | Red | 623 | 1 | 623 | 2024-05-13 | Kathmandu | 1 | Female | 5 | 0 |
| 99 | Customer_100 | T-shirt | Topwear | M | Red | 1287 | 5 | 6435 | 2024-01-13 | Bhaktapur | 1 | Male | 5 | 1 |
| 10 | Customer_11 | Blazer | Formalwear | M | Yellow | 2142 | 2 | 4284 | 2024-09-29 | Thamel | 1 | Male | 2 | 0 |
| 11 | Customer_12 | T-shirt | Topwear | L | Pink | 620 | 1 | 620 | 2024-02-21 | Thamel | 1 | Male | 5 | 1 |
In [114]:
#mapping male as 1 and female as 0
gender_mapping= {
'Male': 1,
'Female': 0
}
df["Gender"] = df["Gender"].map(gender_mapping)
In [115]:
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 100 entries, 0 to 98 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Customer Name 100 non-null object 1 Product 100 non-null object 2 Product Category 100 non-null object 3 Size 100 non-null object 4 Color 100 non-null object 5 Cost 100 non-null int64 6 Quantity 100 non-null int64 7 Total_Cost 100 non-null int64 8 Purchase Date 100 non-null object 9 Store Location 100 non-null object 10 Purchase Method 100 non-null int64 11 Gender 100 non-null int64 12 Product Rating 100 non-null int64 13 Customer Type 100 non-null int64 dtypes: int64(7), object(7) memory usage: 11.7+ KB
In [116]:
#mapping the size
size_mapping= {
'S': 1,
'M': 2,
'L': 3,
'XL': 4
}
df["Size"] = df["Size"].map(size_mapping)
In [117]:
# Ratings Category (Low, Medium, High)
def categorize_rating(rating):
if rating <= 2.5:
return 'Low'
elif rating <= 3.5:
return 'Medium'
else:
return 'High'
df['Rating_Categoy'] = df['Product Rating'].apply(categorize_rating)
df.head()
Out[117]:
| Customer Name | Product | Product Category | Size | Color | Cost | Quantity | Total_Cost | Purchase Date | Store Location | Purchase Method | Gender | Product Rating | Customer Type | Rating_Categoy | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Customer_1 | Dress | Casualwear | 3 | Black | 1800 | 4 | 7200 | 2024-04-30 | Bhaktapur | 1 | 1 | 2 | 1 | Low |
| 9 | Customer_10 | Shorts | Bottomwear | 1 | Red | 623 | 1 | 623 | 2024-05-13 | Kathmandu | 1 | 0 | 5 | 0 | High |
| 99 | Customer_100 | T-shirt | Topwear | 2 | Red | 1287 | 5 | 6435 | 2024-01-13 | Bhaktapur | 1 | 1 | 5 | 1 | High |
| 10 | Customer_11 | Blazer | Formalwear | 2 | Yellow | 2142 | 2 | 4284 | 2024-09-29 | Thamel | 1 | 1 | 2 | 0 | Low |
| 11 | Customer_12 | T-shirt | Topwear | 3 | Pink | 620 | 1 | 620 | 2024-02-21 | Thamel | 1 | 1 | 5 | 1 | High |
In [118]:
df['Product Category'].unique()
Out[118]:
array(['Casualwear', 'Bottomwear', 'Topwear', 'Formalwear', 'Outerwear'],
dtype=object)
In [119]:
#mapping product wear
product_category_mapping= {
'Casualwear': 1,
'Formalwear': 2,
'Bottomwear': 3,
'Outerwear': 4,
'Topwear': 5
}
df["Product Category"] = df["Product Category"].map(product_category_mapping)
Visualizations¶
In [120]:
#visualization based on the store location and the purchase method
import plotly.express as px
df['store'] = df['Store Location'].astype('category')
fig = px.scatter(df, x='store', y='Cost', color='Purchase Method',
title='Scatter Plot of Cost by Store Location and Purchase Method')
fig.show()
In [121]:
# Group by Gender and calculate the average Cost
grouped_df = df.groupby('Gender')['Cost'].mean().reset_index()
# Create a bar plot with specific colors for male and female
fig = px.bar(grouped_df, x='Gender', y='Cost', title='Average Cost by Gender',
color='Gender', color_discrete_map={'Male': 'blue', 'Female': 'pink'})
fig.show()
In [122]:
# Group by Store Location and sum the Total Cost
grouped_df = df.groupby('Store Location')['Total_Cost'].sum().reset_index()
# Create a bar plot
fig = px.bar(grouped_df, x='Store Location', y='Total_Cost', title='Total Cost by Store Location')
fig.show()
In [123]:
# Group by Purchase Method and sum the Quantity
grouped_df = df.groupby('Purchase Method')['Quantity'].sum().reset_index()
fig = px.pie(grouped_df, names='Purchase Method', values='Quantity', title='Total Quantity by Purchase Method', hole=0.3)
fig.show()
In [124]:
# Group by Product Category and Size, and sum the Total Cost
grouped_df = df.groupby(['Product Category', 'Size'])['Total_Cost'].sum().reset_index()
# Create a line plot
fig = px.line(grouped_df, x='Product Category', y='Total_Cost', color='Size',
title='Total Cost by Product Category and Size', markers=True)
fig.show()
In [125]:
# Group by Product Category and calculate the average Product Rating
grouped_df = df.groupby('Product Category')['Product Rating'].mean().reset_index()
fig = px.scatter(grouped_df, x='Product Category', y='Product Rating', title='Average Product Rating by Product Category',
size='Product Rating', color='Product Category')
fig.show()
In [126]:
# Group by Store Location and sum the Total Cost
grouped_df = df.groupby('Store Location')['Total_Cost'].sum().reset_index()
fig = px.bar(grouped_df, x='Store Location', y='Total_Cost', title='Total Cost by Store Location',color='Store Location')
fig.show()
In [127]:
# Group by Gender and Purchase Method, and count the occurrences
grouped_df = df.groupby(['Gender', 'Purchase Method']).size().reset_index(name='Count')
fig = px.pie(grouped_df, names='Purchase Method', values='Count',
title='Gender-wise Purchase Method Distribution', facet_col='Gender')
fig.show()
In [128]:
# Group by Product Category and calculate the average Product Rating
grouped_df = df.groupby('Product Category')['Product Rating'].mean().reset_index()
fig = px.bar(grouped_df, x='Product Category', y='Product Rating', title='Average Product Rating by Product Category',
color='Product Category')
fig.show()
In [129]:
df.columns
Out[129]:
Index(['Customer Name', 'Product', 'Product Category', 'Size', 'Color', 'Cost',
'Quantity', 'Total_Cost', 'Purchase Date', 'Store Location',
'Purchase Method', 'Gender', 'Product Rating', 'Customer Type',
'Rating_Categoy', 'store'],
dtype='object')
In [130]:
df.describe()
Out[130]:
| Product Category | Size | Cost | Quantity | Total_Cost | Purchase Method | Gender | Product Rating | Customer Type | |
|---|---|---|---|---|---|---|---|---|---|
| count | 100.00000 | 100.000000 | 100.000000 | 100.000000 | 100.00000 | 100.000000 | 100.000000 | 100.00000 | 100.000000 |
| mean | 3.15000 | 2.510000 | 2634.640000 | 3.230000 | 8592.31000 | 0.510000 | 0.440000 | 2.99000 | 0.480000 |
| std | 1.34371 | 1.029808 | 1383.482113 | 1.462356 | 6176.86699 | 0.502418 | 0.498888 | 1.46677 | 0.502117 |
| min | 1.00000 | 1.000000 | 533.000000 | 1.000000 | 620.00000 | 0.000000 | 0.000000 | 1.00000 | 0.000000 |
| 25% | 2.00000 | 2.000000 | 1326.750000 | 2.000000 | 3452.25000 | 0.000000 | 0.000000 | 2.00000 | 0.000000 |
| 50% | 3.00000 | 3.000000 | 2550.500000 | 3.000000 | 7446.00000 | 1.000000 | 0.000000 | 3.00000 | 0.000000 |
| 75% | 4.00000 | 3.000000 | 3888.500000 | 5.000000 | 12371.25000 | 1.000000 | 1.000000 | 4.00000 | 1.000000 |
| max | 5.00000 | 4.000000 | 4979.000000 | 5.000000 | 24695.00000 | 1.000000 | 1.000000 | 5.00000 | 1.000000 |
In [131]:
corr = df.corr(numeric_only=1)
corr
Out[131]:
| Product Category | Size | Cost | Quantity | Total_Cost | Purchase Method | Gender | Product Rating | Customer Type | |
|---|---|---|---|---|---|---|---|---|---|
| Product Category | 1.000000 | -0.019344 | 0.047035 | -0.063999 | 0.044899 | -0.009725 | 0.036163 | 0.149395 | 0.251515 |
| Size | -0.019344 | 1.000000 | 0.197383 | -0.018311 | 0.153153 | -0.117332 | -0.126617 | -0.103585 | -0.028911 |
| Cost | 0.047035 | 0.197383 | 1.000000 | 0.041151 | 0.722298 | 0.091731 | -0.112076 | -0.231336 | 0.065437 |
| Quantity | -0.063999 | -0.018311 | 0.041151 | 1.000000 | 0.639537 | -0.161267 | -0.070889 | 0.076431 | 0.013206 |
| Total_Cost | 0.044899 | 0.153153 | 0.722298 | 0.639537 | 1.000000 | -0.001962 | -0.160343 | -0.082812 | 0.052109 |
| Purchase Method | -0.009725 | -0.117332 | 0.091731 | -0.161267 | -0.001962 | 1.000000 | 0.143465 | -0.061544 | -0.059259 |
| Gender | 0.036163 | -0.126617 | -0.112076 | -0.070889 | -0.160343 | 0.143465 | 1.000000 | -0.021534 | -0.125809 |
| Product Rating | 0.149395 | -0.103585 | -0.231336 | 0.076431 | -0.082812 | -0.061544 | -0.021534 | 1.000000 | -0.075707 |
| Customer Type | 0.251515 | -0.028911 | 0.065437 | 0.013206 | 0.052109 | -0.059259 | -0.125809 | -0.075707 | 1.000000 |
In [132]:
# Heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='magma', linewidths=0.5)
plt.title('Correlation Heatmap')
plt.show()
model Training¶
In [133]:
Features = df[['Total_Cost','Purchase Method','Product Category','Size','Quantity']]
Target = df['Cost']
In [134]:
#splitting the data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(Features, Target, test_size=0.2, random_state=0)
In [135]:
# Gradient Boosting Regressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score
gbr = GradientBoostingRegressor()
# Train the model
gbr.fit(X_train, y_train)
y_pred = gbr.predict(X_test)
In [136]:
# Model Evaluation
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Absolute Error: ", mae)
print("R2 Score: ", r2)
Mean Absolute Error: 200.90494036792384 R2 Score: 0.9729606885493439
In [137]:
# Visualization of predicted vs actual costs
fig_cost = px.scatter(x=y_test, y=y_pred, labels={'x': 'Actual Cost', 'y': 'Predicted Cost'}, title='Actual vs Predicted Cost')
fig_cost.add_shape(
type="line", line=dict(dash='dash'),
x0=y_test.min(), y0=y_test.min(),
x1=y_test.max(), y1=y_test.max()
)
fig_cost.update_layout(paper_bgcolor="white")
fig_cost.show()
cost_list = [[predicted] for actual, predicted in zip(y_test, y_pred)]
In [138]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score
gbr = GradientBoostingRegressor(n_estimators=300, learning_rate=0.2, max_depth=3, random_state=42,loss='squared_error')
gbr.fit(X_train, y_train)
y_pred = gbr.predict(X_test)
cv_scores_r2 = cross_val_score(gbr, Features, Target, cv=5, scoring='r2')
print("Cross-validation R2 scores for Gradient Boosting Regressor:", cv_scores_r2)
print("Mean R2 score:", cv_scores_r2.mean())
# Perform cross-validation for Mean Absolute Error
cv_scores_mae = cross_val_score(gbr, Features, Target, cv=5, scoring='neg_mean_absolute_error')
cv_scores_mae = -cv_scores_mae # Convert to positive values
print("Cross-validation MAE scores for Gradient Boosting Regressor:", cv_scores_mae)
print("Mean MAE score:", cv_scores_mae.mean())
Cross-validation R2 scores for Gradient Boosting Regressor: [0.97341073 0.96872318 0.9516513 0.94338487 0.95744448] Mean R2 score: 0.9589229130407503 Cross-validation MAE scores for Gradient Boosting Regressor: [180.71856597 207.74159584 229.73779601 201.30292391 215.83694738] Mean MAE score: 207.06756582212523
Random Forest Regressor¶
In [139]:
from sklearn.ensemble import RandomForestRegressor
# basic model
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)
y_pred = rfr.predict(X_test)
In [140]:
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Absolute Error: ", mae)
print("R2 Score: ", r2)
Mean Absolute Error: 419.21450000000004 R2 Score: 0.8840462075478562
In [145]:
#plot tree diagram based model
from sklearn.tree import plot_tree
plt.figure(figsize=(20, 10))
plot_tree(rfr.estimators_[0], filled=True, feature_names=Features.columns)
plt.show()
Hyperparameter and cross validation
In [141]:
#hyperparameter tuning for random forest regressor
rfr_params = {
'n_estimators': 300,
'max_depth': 3,
'random_state': 42
}
rfr = RandomForestRegressor(**rfr_params)
# Fit the model
rfr.fit(X_train, y_train)
# Predict on the test set
y_pred_reg = rfr.predict(X_test)
# Perform cross-validation for R2 score
cv_scores_r2 = cross_val_score(rfr, Features, Target, cv=5, scoring='r2')
print("Cross-validation R2 scores for Random Forest Regressor:", cv_scores_r2)
print("Mean R2 score:", cv_scores_r2.mean())
# Perform cross-validation for Mean Absolute Error
cv_scores_mae = cross_val_score(rfr, Features, Target, cv=5, scoring='neg_mean_absolute_error')
cv_scores_mae = -cv_scores_mae
print("Cross-validation MAE scores for Random Forest Regressor:", cv_scores_mae)
print("Mean MAE score:", cv_scores_mae.mean())
Cross-validation R2 scores for Random Forest Regressor: [0.85507448 0.80738194 0.81554827 0.7633761 0.77008654] Mean R2 score: 0.8022934659816918 Cross-validation MAE scores for Random Forest Regressor: [403.70004968 564.66271815 468.18076216 559.51067536 584.78026227] Mean MAE score: 516.1668935224409
In [142]:
# Visualization of predicted vs actual costs after using Random Forest Regressor
fig_cost = px.scatter(x=y_test, y=y_pred_reg, labels={'x': 'Actual Cost', 'y': 'Predicted Cost'}, title='Actual vs Predicted Cost')
fig_cost.add_shape(
type="line",
x0=y_test.min(),
y0=y_test.min(),
x1=y_test.max(),
y1=y_test.max(),
line=dict(color="Red"),
)
fig_cost.update_layout(paper_bgcolor="white")
fig_cost.show()
In [143]:
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
fig = go.Figure(data=[go.Scatter(x=[1, 2, 3], y=[4, 5, 6])])
iplot(fig)
In [ ]: